“In the name of God, I, take you, to be my husband/wife, to have and to hold from this day forward, for better or worse, for richer or poorer, in sickness and in health, to love and to cherish all the days of my life. This is my solemn vow.”
So you still believe in fairy tale?
Time to wake up.
In this project, we seek to find out features that may affect people’s marriage status. Are they truly loyal to their wedding vows? Or are there certain patterns or factors that may lead their marriage to end?
We choose ‘marital status’ and ‘number of times married’ as measurement of a person’s performance on marriage. By relating this to other factors, such as income, eduction level, or health status, we wish to find some patterns upon them.
We may find answers for some interesting questions, for example, does high income help people keep marriage steady? Should we listen to parents’ suggestions to break up with boyfriend who has a severe disease or disability? Is it a good idea to get married with a person who has a doctor degree?
Our goal is to find related factors that may bring negative effects to marriage, and avoid them.
We tag marriage performance by two measurement: ‘Marital Status’ and ‘Number of Times Married’.
fac = 1 (Good Marriage Performance): the number of times married is one and the marital status is ‘not divorced’ now, we think this person does very well in marriage.
fac = 2 (Poor Marriage Performance): the number of times married is one and the marital status is ‘divorced’ now, or the number of times is two and the marital status is ‘not divorced’ now, we think this person performs not bad in marriage.
fac = 3 (Inferior Marriage Performance): the number of times married is three, or this number is two and the marital status is ‘divorced’ now, we think this person does very poor in marriage.
databad <- data_ori[(MAR == 3 | MARHT >= 2)]
datagood <- data_ori[(MAR != 3 & MARHT < 2)]
#Define a good marriage attitude
datagood[, fac := 1, ]
#Define a poor marriage attitude
databad1 <- databad[!(MARHT >= 3 | (MARHT == 2 & MAR == 3))]
databad1[, fac := 2, ]
#Define a inferior marriage attitude
databad2 <- databad[MARHT >= 3 | (MARHT == 2 & MAR == 3)]
databad2[, fac := 3, ]
data_f <- rbind(datagood, databad1, databad2)
data_dis <- data_final[!is.na(DIS), list(DIS, fac)]
data_RAC1P <- data_final[RAC1P==1|RAC1P==2|RAC1P==3|RAC1P==6, list(RAC1P, fac)]
data_SCHL <- data_final[SCHL==1|SCHL==16|SCHL==21|SCHL==22|SCHL==24, list(SCHL, fac)]
data_WAGP <- data_final[!is.na(WAGP), list(WAGP, fac)]
data_VPS <- data_final[VPS==1|VPS==6|VPS==11, list(VPS, fac)]
data_MIL <- data_final[!is.na(MIL), list(MIL, fac)]
We removed all NA and balanced weight for the sample data.
data_all_var <- data_final[!is.na(DIS) & (RAC1P==1|RAC1P==2|RAC1P==3|RAC1P==6) & (SCHL==1|SCHL==16|SCHL==21|SCHL==22|SCHL==24) & !is.na(WAGP) & !is.na(MIL), list(DIS, RAC1P, SCHL, WAGP, MIL, PWGTP, fac)]
#load disability file
allvar <- fread('updated_data_all_var.csv')
allvar <- data.frame(allvar)
#clustering based on wage
allvar$WAGP <- as.array(kmeans(allvar$WAGP, 10, nstart = 20)$cluster)
#head(allvar)
# 1.DIC 2. RAC1P 3.SCHL 4.WAGE 5.MIL 6.PWGTP 7.fac
#######
#1.wage -> dis
f.dis <- function(col1){
a <- sort(unique(allvar[,col1]))
b <- sort(unique(allvar$DIS))
results <- matrix(NA, nrow = length(a), ncol = length(b) )
cou = 1
for (i in 1:length(a)){
flow <- count(allvar[which(allvar[,col1]==a[i]),],DIS, wt = PWGTP)
colnames(results) <- sort(b)
rownames(results) <- sort(a)
results[cou,] <-flow$n
cou <- cou + 1
}
return(results)
}
# Wage to fac
f.dis(4)
#######
#2.dis -> edu
unique(allvar$DIS)
f.edu <- function(col1){
a <- sort(unique(allvar[,col1]))
b <- sort(unique(allvar$SCHL))
results <- matrix(NA, nrow = length(a), ncol = length(b) )
cou = 1
for (i in 1:length(a)){
flow <- count(allvar[which(allvar[,col1]==a[i]),],SCHL, wt = PWGTP)
colnames(results) <- sort(b)
rownames(results) <- sort(a)
results[cou,] <-flow$n
cou <- cou + 1
}
return(results)
}
# Dic to edu
f.edu(1)
#######
#3. edu -> race
unique(allvar$SCHL)
unique(allvar$RAC1P)
f.rac <- function(col1){
a <- sort(unique(allvar[,col1]))
b <- sort(unique(allvar$RAC1P))
results <- matrix(NA, nrow = length(a), ncol = length(b) )
cou = 1
for (i in 1:length(a)){
flow <- count(allvar[which(allvar[,col1]==a[i]),],RAC1P, wt = PWGTP)
colnames(results) <- sort(b)
rownames(results) <- sort(a)
results[cou,] <-flow$n
cou <- cou + 1
}
return(results)
}
# Edu to race
f.rac(3)
#######
#4. race -> millitary
unique(allvar$RAC1P)
unique(allvar$MIL)
f.mil <- function(col1){
a <- sort(unique(allvar[,col1]))
b <- sort(unique(allvar$MIL))
results <- matrix(NA, nrow = length(a), ncol = length(b) )
cou = 1
for (i in 1:length(a)){
flow <- count(allvar[which(allvar[,col1]==a[i]),],MIL, wt = PWGTP)
colnames(results) <- sort(b)
rownames(results) <- sort(a)
results[cou,] <-flow$n
cou <- cou + 1
}
return(results)
}
# race to millitary
f.mil(2)
########################################
# each variable numbers flow to fac
f <- function(col1){ #col1 ->col2
a <- sort(unique(allvar[,col1]))
b <- sort(unique(allvar$fac))
results <- matrix(NA, nrow = length(a), ncol = length(b) )
cou = 1
for (i in 1:length(a)){
flow <- count(allvar[which(allvar[,col1]==a[i]),],fac, wt = PWGTP)
colnames(results) <- sort(b)
rownames(results) <- sort(a)
results[cou,] <-flow$n
cou <- cou + 1
}
return(results)
}
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: rCharts
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'rCharts'
## Loading required package: plyr
#plot
##wages:
# 1.box-plots for wage
p <- plot_ly(ggplot2::diamonds, y = data_raw$WAGP , color = data_raw$fac , type = "box")
p
# 2. bar chart for wage
p <- plot_ly(
x = c("A","B","C","D","E","F","G","H","I","J"),
y = c(377034, 25113657, 5645546, 7056107, 944659, 383814, 4157641, 2427844, 6522414, 6172354),
name = "good marriage performance",
type = "bar")
p2 <- add_trace(
p,
x = c("A","B","C","D","E","F","G","H","I","J"),
y = c(114668, 9748396, 2379307, 2767572, 257943, 95312, 1315064, 696605, 2822095, 2182342),
name = "poor marriage performance",
type = "bar")
p3 <- add_trace(
p2,
x = c("A","B","C","D","E","F","G","H","I","J"),
y = c(30508, 3396854, 696597, 708733, 46991, 18961, 278326, 161293, 758216, 517508),
name = "inferior marriage performance",
type = "bar")
p4 <- layout(p3, barmode = "stack")
p4
# 3.bar chart for race
library(plotly)
p <- plot_ly(
x = c("white", "black/africa Am.", "Am. Indian", "Asian"),
y = c(48622273,4629716,204187,5344894),
name = "good marriage performance",
type = "bar")
p2 <- add_trace(
p,
x = c("white", "black/africa Am.", "Am. Indian", "Asian"),
y = c(430131,11367450,6906974,3135660,539089),
name = "poor marriage performance",
type = "bar")
p3 <- add_trace(
p2,
x = c("white", "black/africa Am.", "Am. Indian", "Asian"),
y = c(148481,3797101,1722793,797134,148478),
name = "inferior marriage performance",
type = "bar")
p4 <- layout(p3, barmode = "stack")
p4
##4. barchart for education
library(plotly)
p <- plot_ly(
x = c("No completed school", "high school", "bachelor", "master", "doctorate"),
y = c(1372324,23810758,2180033,1725501),
name = "good marriage performance",
type = "bar")
p2 <- add_trace(
p,
x = c("No completed school", "high school", "bachelor", "master", "doctorate"),
y = c(430131,11367450,6906974,3135660,539089),
name = "poor marriage performance",
type = "bar")
p3 <- add_trace(
p2,
x = c("No completed school", "high school", "bachelor", "master", "doctorate"),
y = c(148481,3797101,1722793,148478),
name = "inferior marriage performance",
type = "bar")
p4 <- layout(p3, barmode = "stack")
p4
##5. barchart for military
p <- plot_ly(
x = c("No duty", "Duty in pust", "Only duty for training", "Never served"),
y = c(202241,4757340,888360,52953129),
name = "good marriage performance",
type = "bar")
p2 <- add_trace(
p,
x = c("No duty", "Duty in pust", "Only duty for training", "Never served"),
y = c(50390,2541718,409714,19377482),
name = "poor marriage performance",
type = "bar")
p3 <- add_trace(
p2,
x = c("No duty", "Duty in pust", "Only duty for training", "Never served"),
y = c(7198,1047573,147901,5411315),
name = "inferior marriage performance",
type = "bar")
p4 <- layout(p3, barmode = "stack")
p4
Parallel coordinates is a visualization technique used to plot individual data elements across many dimensions. Each of the dimensions corresponds to a vertical axis and each data element is displayed as a series of connected points along the dimensions/axes. This technique can have an explicit explanation of the corresponding categorical parameters.
## Parallel coordinates plots
subdata = as.data.frame(subdata)
subdata_new = subdata[,c(1,2,3,4,5,7)]
scpcp(subdata_new, sel="data[,6]", sel.palette = "w")